# Replication File for Seljan and Gronke, 2022, "Happy Birthday You Get
# To Vote". 
#
# OREGON: Anonymize 2016 voter file for distribution 
#
#
# This file takes as input a January 2017 voter list file, obtainable
# from the Division of Elections, State of Oregon. This file represents
# the state of the Oregon central voter registration system after the 
# first year of implementation of AVR and the 2016 presidential election.
#
# We have not distributed the raw data file because it contains personally 
# identifiable information for millions of individual registrants. 
#
# The output files contain all data elements necessary to replicate the 
# full analysis.

library(data.table)
library(dplyr)
library(lubridate)
library(mosaic)
library(stringr)
library(AER)
library(broom)
library(tidyverse)
library(tidymodels)
library(stargazer)
library(radiant)
library(openxlsx)
library(wru)
library(gender)
library(genderdata)
library(fastDummies)

# Session-wide print settings: a large scipen penalty makes R prefer fixed over
# scientific notation, and digits = 3 prints numbers to 3 significant digits.
options(scipen = 100, digits = 3)

# Oregon 2016 Voter File --------------------------------------------------


# Read the raw tab-delimited extracts (one voter-history file and one
# voter-list file per congressional district) and convert every column to
# character before stacking the districts.

# Read one extract, given its path relative to data_dir, and return it as a
# data frame in which every column has been converted to character.
read_as_character <- function(relative_path) {
  raw <- read.delim(file.path(data_dir, relative_path))
  data.frame(lapply(raw, as.character), stringsAsFactors = FALSE)
}

# Voter history, congressional districts 1-5
vhcd1 <- read_as_character("original_data/StatewideVoterList_January2017/VoterHistory_Jan2017/CD1_VoterHistory_Jan2017.txt")
vhcd2 <- read_as_character("original_data/StatewideVoterList_January2017/VoterHistory_Jan2017/CD2_VoterHistory_Jan2017.txt")
vhcd3 <- read_as_character("original_data/StatewideVoterList_January2017/VoterHistory_Jan2017/CD3_VoterHistory_Jan2017.txt")
vhcd4 <- read_as_character("original_data/StatewideVoterList_January2017/VoterHistory_Jan2017/CD4_VoterHistory_Jan2017.txt")
vhcd5 <- read_as_character("original_data/StatewideVoterList_January2017/VoterHistory_Jan2017/CD5_VoterHistory_Jan2017.txt")

# Voter list, congressional districts 1-5
vlcd1 <- read_as_character("original_data/StatewideVoterList_January2017/VoterList_Jan2017/CD1_VoterList_Jan2017.txt")
vlcd2 <- read_as_character("original_data/StatewideVoterList_January2017/VoterList_Jan2017/CD2_VoterList_Jan2017.txt")
vlcd3 <- read_as_character("original_data/StatewideVoterList_January2017/VoterList_Jan2017/CD3_VoterList_Jan2017.txt")
vlcd4 <- read_as_character("original_data/StatewideVoterList_January2017/VoterList_Jan2017/CD4_VoterList_Jan2017.txt")
vlcd5 <- read_as_character("original_data/StatewideVoterList_January2017/VoterList_Jan2017/CD5_VoterList_Jan2017.txt")

# Stack the five districts into one statewide table of each kind
voterhistory <- do.call(rbind, list(vhcd1, vhcd2, vhcd3, vhcd4, vhcd5))
voterlist <- do.call(rbind, list(vlcd1, vlcd2, vlcd3, vlcd4, vlcd5))


# Join the voter roll to the voter history on VOTER_ID (only voters present in
# both tables are kept), then keep the identifying fields from the voter list
# (the ".x" columns) plus one participation column per election, ordered as:
# the 2016/2014/2012 November generals first, then the remaining elections
# from May 2016 back to November 2006.
OR_2016 <- voterlist %>%
  inner_join(voterhistory, by = "VOTER_ID") %>%
  select(
    VOTER_ID, FIRST_NAME.x, LAST_NAME.x, BIRTH_DATE.x, EFF_REGN_DATE.x,
    PARTY_CODE.x, COUNTY.x,
    X11.08.2016, X11.04.2014, X11.06.2012,
    X05.17.2016, X05.19.2015, X05.20.2014,
    X11.05.2013, X05.21.2013,
    X05.15.2012, X01.31.2012,
    X11.08.2011, X05.17.2011,
    X11.02.2010, X05.18.2010, X01.26.2010,
    X11.04.2009, X05.19.2009,
    X11.04.2008, X05.20.2008,
    X11.06.2007, X05.15.2007,
    X11.07.2006
  )


# Attach the OMV DESCRIPTION to each voter. VOTER_ID is converted to character
# so it matches the key type used in OR_2016.
OMV_2016 <- read.table(file.path(data_dir, "original_data/StatewideVoterList_January2017/OMV_Jan2017.txt"), header = TRUE) %>%
  select(VOTER_ID, DESCRIPTION) %>%
  mutate(VOTER_ID = as.character(VOTER_ID))

OR_2016 <- left_join(OR_2016, OMV_2016, by = "VOTER_ID")

# Voters with no OMV record get the placeholder string "NA"; the factor levels
# are then relabeled by position.
# NOTE(review): the relabeling assumes exactly three levels whose sorted order
# is (phase 1, phase 2, "NA") -- confirm against the values in the OMV file.
omv_description <- ifelse(is.na(OR_2016$DESCRIPTION), "NA", OR_2016$DESCRIPTION)
OR_2016$DESCRIPTION <- as.factor(omv_description)
levels(OR_2016$DESCRIPTION) <- c("OMV Phase 1", "OMV Phase2", "Traditional")

# Parse birth and registration dates (month/day/year strings) into dates,
# then derive the registration year and 0/1 indicators for registering in
# 2016, 2014 and 2012.
OR_2016 <- OR_2016 %>%
  mutate(
    lub_birthdate = mdy(BIRTH_DATE.x),
    lub_regdate = mdy(EFF_REGN_DATE.x),
    reg_year = year(lub_regdate),
    regyear2016 = ifelse(reg_year == 2016, 1, 0),
    regyear2014 = ifelse(reg_year == 2014, 1, 0),
    regyear2012 = ifelse(reg_year == 2012, 1, 0)
  )

# Voter history by registration year.
#
# voterhistoryYYYY is 0 when the paired election column holds "-" and 1
# otherwise. Each registration year is paired with the most recent even-year
# November election before it (2016 and 2015 -> Nov 2014, 2014 and 2013 ->
# Nov 2012, and so on back to 2008 and 2007 -> Nov 2006).
# NOTE(review): "-" presumably marks "no ballot recorded" -- confirm against
# the voter-history file layout.
#
# voterhistory_byregyear is 1 when the indicator matching the voter's own
# registration year is 1, and 0 in every other case (including registration
# years outside 2007-2016).
OR_2016 <- OR_2016 %>%
  mutate(
    voterhistory2016 = if_else(X11.04.2014 == "-", 0, 1),
    voterhistory2015 = if_else(X11.04.2014 == "-", 0, 1),
    voterhistory2014 = if_else(X11.06.2012 == "-", 0, 1),
    voterhistory2013 = if_else(X11.06.2012 == "-", 0, 1),
    voterhistory2012 = if_else(X11.02.2010 == "-", 0, 1),
    voterhistory2011 = if_else(X11.02.2010 == "-", 0, 1),
    voterhistory2010 = if_else(X11.04.2008 == "-", 0, 1),
    voterhistory2009 = if_else(X11.04.2008 == "-", 0, 1),
    voterhistory2008 = if_else(X11.07.2006 == "-", 0, 1),
    voterhistory2007 = if_else(X11.07.2006 == "-", 0, 1),
    voterhistory_byregyear = case_when(
      reg_year == 2016 & voterhistory2016 == 1 ~ 1,
      reg_year == 2015 & voterhistory2015 == 1 ~ 1,
      reg_year == 2014 & voterhistory2014 == 1 ~ 1,
      reg_year == 2013 & voterhistory2013 == 1 ~ 1,
      reg_year == 2012 & voterhistory2012 == 1 ~ 1,
      reg_year == 2011 & voterhistory2011 == 1 ~ 1,
      reg_year == 2010 & voterhistory2010 == 1 ~ 1,
      reg_year == 2009 & voterhistory2009 == 1 ~ 1,
      reg_year == 2008 & voterhistory2008 == 1 ~ 1,
      reg_year == 2007 & voterhistory2007 == 1 ~ 1,
      TRUE ~ 0
    )
  )


# Compute age at the November 8, 2016 General Election, accounting for leap
# years.
#
# birth_year and birth_day_of_year are derived here from lub_birthdate: the
# age calculation below needs both, the gender prediction step later in this
# file needs birth_year, and neither column is created anywhere else.
#
# electionday2016 is the day-of-year of election day within 2016 (a leap
# year). For a birth date in a non-leap year, the same calendar date from
# March onward has a day-of-year one lower than in 2016, so 1 is added before
# comparing. The extra day is harmless for January/February birthdays, which
# are far from election day.
electionday2016 <- yday(ymd("2016-11-08"))

OR_2016 <- OR_2016 %>%
  mutate(
    birth_year = year(lub_birthdate),
    birth_day_of_year = yday(lub_birthdate),
    age_at_2016election = case_when(
      # Birthday falls on or before election day: full years elapsed
      leap_year(lub_birthdate) & birth_day_of_year <= electionday2016 ~ 2016 - birth_year,
      !leap_year(lub_birthdate) & birth_day_of_year + 1 <= electionday2016 ~ 2016 - birth_year,
      # Birthday falls after election day (or birth date missing, giving NA)
      TRUE ~ 2016 - birth_year - 1
    )
  )

# Keep registrants aged 18 through 109 at the 2016 election; rows with a
# missing age are dropped by filter() as well.
OR_2016 <- filter(
  OR_2016,
  age_at_2016election > 17,
  age_at_2016election < 110
)


# Populous county dummy (defined as counties over 150K): 1 for the seven
# counties listed, 0 for every other value, including a missing county.
OR_2016 <- OR_2016 %>%
  mutate(
    populous_county = if_else(
      COUNTY.x %in% c(
        "MULTNOMAH", "WASHINGTON", "CLACKAMAS", "LANE",
        "MARION", "JACKSON", "DESCHUTES"
      ),
      1,
      0
    )
  )
# County FIPS lookup: county name as it appears in COUNTY.x -> integer county
# code. Built as a two-column tibble (county, COUNTY.x) and joined onto the
# voter file by COUNTY.x.
fips_by_county <- c(
  "BAKER" = 1L,
  "BENTON" = 3L,
  "CLACKAMAS" = 5L,
  "CLATSOP" = 7L,
  "COLUMBIA" = 9L,
  "COOS" = 11L,
  "CROOK" = 13L,
  "CURRY" = 15L,
  "DESCHUTES" = 17L,
  "DOUGLAS" = 19L,
  "GILLIAM" = 21L,
  "GRANT" = 23L,
  "HARNEY" = 25L,
  "HOOD RIVER" = 27L,
  "JACKSON" = 29L,
  "JEFFERSON" = 31L,
  "JOSEPHINE" = 33L,
  "KLAMATH" = 35L,
  "LAKE" = 37L,
  "LANE" = 39L,
  "LINCOLN" = 41L,
  "LINN" = 43L,
  "MALHEUR" = 45L,
  "MARION" = 47L,
  "MORROW" = 49L,
  "MULTNOMAH" = 51L,
  "POLK" = 53L,
  "SHERMAN" = 55L,
  "TILLAMOOK" = 57L,
  "UMATILLA" = 59L,
  "UNION" = 61L,
  "WALLOWA" = 63L,
  "WASCO" = 65L,
  "WASHINGTON" = 67L,
  "WHEELER" = 69L,
  "YAMHILL" = 71L
)

OR_fips <- tibble::tibble(
  county = unname(fips_by_county),
  COUNTY.x = names(fips_by_county)
)

OR_2016 <- OR_2016 %>% left_join(OR_fips, by = "COUNTY.x")

# Impute Race Variables
#
# predict_race() is given the voter file with surname, state and county
# columns, plus county-level census data for Oregon downloaded by
# get_census_data().
#
# The Census API key is read from the CENSUS_API_KEY environment variable
# rather than being written into this file, so no credential is distributed
# with the replication materials. Keys are free from
# https://api.census.gov/data/key_signup.html.
#
# NOTE(review): the county column joined in earlier is an integer FIPS code
# (1L, 3L, ...); the wru documentation describes county as a three-character
# string (e.g. "031"). Confirm the installed wru version matches on the
# integer form.
census_key <- Sys.getenv("CENSUS_API_KEY")
if (!nzchar(census_key)) {
  stop(
    "Census API key not found. Set the CENSUS_API_KEY environment variable ",
    "(e.g. in ~/.Renviron) before running the race imputation.",
    call. = FALSE
  )
}

OR_2016$surname <- OR_2016$LAST_NAME.x
OR_2016$state <- "OR"

OR_census_race <- get_census_data(
  key = census_key, states = "OR", age = FALSE, sex = FALSE,
  census.geo = "county", retry = 0
)
OR_2016 <- predict_race(
  voter.file = OR_2016, census.data = OR_census_race,
  census.geo = "county", census.key = census_key
)

# Turn the predicted race probabilities into 0/1 indicators: a voter is
# assigned to a category only when its probability is strictly greater than
# each of the other four, so an exact tie for the top probability leaves all
# five indicators at 0. The probability columns are dropped afterwards.
OR_2016 <- OR_2016 %>%
  mutate(
    white = ifelse(
      pred.whi > pred.bla & pred.whi > pred.his &
        pred.whi > pred.asi & pred.whi > pred.oth, 1, 0
    ),
    black = ifelse(
      pred.bla > pred.whi & pred.bla > pred.his &
        pred.bla > pred.asi & pred.bla > pred.oth, 1, 0
    ),
    hispanic = ifelse(
      pred.his > pred.bla & pred.his > pred.whi &
        pred.his > pred.asi & pred.his > pred.oth, 1, 0
    ),
    other_race = ifelse(
      pred.oth > pred.bla & pred.oth > pred.whi &
        pred.oth > pred.asi & pred.oth > pred.his, 1, 0
    ),
    asian = ifelse(
      pred.asi > pred.bla & pred.asi > pred.whi &
        pred.asi > pred.his & pred.asi > pred.oth, 1, 0
    )
  ) %>%
  select(-c(pred.bla, pred.his, pred.asi, pred.whi, pred.oth))

# Predict gender from first name and birth year.
#
# The gender_df() result is reduced to a lookup table with one row per
# distinct (gender_female, birth_year, FIRST_NAME.x) combination:
#   gender_female  1 when proportion_female >= 0.5, otherwise 0
#   birth_year     copied from year_min
#   FIRST_NAME.x   copied from name
# Only birth years 1906 through 1999 are kept.
gender <- OR_2016 %>%
  gender_df(name_col = "FIRST_NAME.x", year_col = "birth_year") %>%
  mutate(
    gender_female = ifelse(proportion_female >= 0.5, 1, 0),
    birth_year = year_min,
    FIRST_NAME.x = name
  ) %>%
  select(
    -year_min, -year_max, -proportion_male, -proportion_female,
    -name, -gender
  ) %>%
  filter(birth_year > 1905 & birth_year < 2000) %>%
  distinct(gender_female, birth_year, FIRST_NAME.x)

# Attach the predicted gender by first name and birth year, then derive:
#   gender3              gender_female with missing predictions coded 2
#   gender_female_dummy  gender_female with missing predictions coded 0
#   gender3_*            one dummy column per value of gender3
OR_2016 <- OR_2016 %>%
  left_join(gender, by = c("FIRST_NAME.x", "birth_year")) %>%
  mutate(
    gender3 = replace_na(gender_female, 2),
    gender_female_dummy = replace_na(gender_female, 0)
  ) %>%
  dummy_cols(select_columns = "gender3")


# Keep only the columns distributed in the anonymized file; names, VOTER_ID
# and county are not among them. The result is written to the working
# directory.
OR_2016 <- OR_2016 %>%
  select(
    # Raw date strings, party and OMV status
    BIRTH_DATE.x, EFF_REGN_DATE.x, PARTY_CODE.x, DESCRIPTION,
    # Participation in the last three November general elections
    X11.06.2012, X11.04.2014, X11.08.2016,
    # Parsed dates and derived registration / history / age variables
    lub_regdate, lub_birthdate, reg_year, voterhistory_byregyear,
    age_at_2016election, populous_county,
    # Imputed race indicators
    white, black, hispanic, other_race, asian,
    # Predicted gender indicators
    gender_female, gender3_0, gender3_1, gender3_2
  )

saveRDS(OR_2016, file = "OR2016_anon.RDS")


